library(MASS)
data(Boston)
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
boston_scaled = scale(Boston)
str(boston_scaled)
##  num [1:506, 1:14] -0.419 -0.417 -0.417 -0.416 -0.412 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:506] "1" "2" "3" "4" ...
##   ..$ : chr [1:14] "crim" "zn" "indus" "chas" ...
##  - attr(*, "scaled:center")= Named num [1:14] 3.6135 11.3636 11.1368 0.0692 0.5547 ...
##   ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
##  - attr(*, "scaled:scale")= Named num [1:14] 8.602 23.322 6.86 0.254 0.116 ...
##   ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
boston_scaled <- as.data.frame(boston_scaled)
bins <- quantile(boston_scaled$crim)
bins
##           0%          25%          50%          75%         100% 
## -0.419366929 -0.410563278 -0.390280295  0.007389247  9.924109610
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)

n <- nrow(boston_scaled)
# choose randomly 80% of the rows
ind <- sample(n,  size = n * 0.8)

# create train set
train <- boston_scaled[ind,]

# create test set 
test <- boston_scaled[-ind,]

# save the correct classes from test data
correct_classes <- test$crime

# remove the crime variable from test data
test <- dplyr::select(test, -crime)

# LDA
lda.fit <- lda(crime ~ ., data = train)

# print the lda.fit object
lda.fit
## Call:
## lda(crime ~ ., data = train)
## 
## Prior probabilities of groups:
##       low   med_low  med_high      high 
## 0.2574257 0.2326733 0.2623762 0.2475248 
## 
## Group means:
##                  zn      indus        chas        nox          rm        age
## low       0.8588545 -0.8980181 -0.12090214 -0.8444985  0.47282047 -0.8022219
## med_low  -0.0518548 -0.3134681  0.06274329 -0.6043089 -0.07656227 -0.4435540
## med_high -0.3966319  0.2589551  0.21052285  0.4037540 -0.01564562  0.4084285
## high     -0.4872402  1.0171519 -0.07547406  1.0287999 -0.41182369  0.8034463
##                 dis        rad        tax    ptratio       black       lstat
## low       0.8271116 -0.7024845 -0.7535811 -0.3929549  0.37757208 -0.75178515
## med_low   0.4402709 -0.5444763 -0.4710934 -0.1381793  0.34131759 -0.20857490
## med_high -0.3697539 -0.4282234 -0.2884499 -0.2413524  0.04496343  0.06298321
## high     -0.8547941  1.6377820  1.5138081  0.7803736 -0.94029858  0.91353880
##                 medv
## low       0.53213671
## med_low   0.04721214
## med_high  0.07397994
## high     -0.72444611
## 
## Coefficients of linear discriminants:
##                  LD1          LD2         LD3
## zn       0.098781533  0.525306859 -0.99942228
## indus    0.029911155 -0.333004741  0.39681899
## chas    -0.084237148 -0.074520107  0.18346408
## nox      0.352927842 -0.820834305 -1.19899518
## rm      -0.086810309 -0.009511551 -0.09432911
## age      0.229795060 -0.210561205 -0.35902832
## dis     -0.062562559 -0.165654465  0.22071162
## rad      3.156749857  0.978399363 -0.14491531
## tax      0.006663194  0.016130427  0.69239722
## ptratio  0.139205135 -0.063029371 -0.42187778
## black   -0.133174185  0.014375361  0.16161958
## lstat    0.244485522 -0.231722183  0.35534634
## medv     0.199973676 -0.405706802 -0.21375527
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.9518 0.0362 0.0120
# the function for lda biplot arrows
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "orange", tex = 0.75, choices = c(1,2)){
  heads <- coef(x)
  arrows(x0 = 0, y0 = 0, 
         x1 = myscale * heads[,choices[1]], 
         y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
  text(myscale * heads[,choices], labels = row.names(heads), 
       cex = tex, col=color, pos=3)
}

# target classes as numeric
classes <- as.numeric(train$crime)

# plot the lda results
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)

# cross tabulate the results
table(correct = correct_classes, predicted = lda.pred$class)
##           predicted
## correct    low med_low med_high high
##   low       19       4        0    0
##   med_low    3      18       11    0
##   med_high   0       1       18    1
##   high       0       0        0   27
# load MASS and Boston
library(MASS)
library(ggplot2)
data('Boston')

Boston <- scale(Boston)

# euclidean distance matrix
dist_eu <- dist(Boston)

# look at the summary of the distances
summary(dist_eu)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1343  3.4625  4.8241  4.9111  6.1863 14.3970
# manhattan distance matrix
dist_man <- dist(Boston, method = 'manhattan')

# look at the summary of the distances
summary(dist_man)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2662  8.4832 12.6090 13.5488 17.7568 48.8618
# k-means clustering
km <-kmeans(Boston, centers = 3)

# plot the Boston dataset with clusters
pairs(Boston, col = km$cluster)

set.seed(123)

# determine the number of clusters
k_max <- 10

# calculate the total within sum of squares
twcss <- sapply(1:k_max, function(k){kmeans(Boston, k)$tot.withinss})

# visualize the results
qplot(x = 1:k_max, y = twcss, geom = 'line')

data(Boston)
Boston <- as.data.frame(scale(Boston))
km <- kmeans(Boston, centers = 5)
Boston$km <- km$cluster
lda.fit <- lda(km~., Boston)
classes <- as.numeric(Boston$km)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

library(MASS)
data(Boston)
boston_scaled = scale(Boston)
boston_scaled <- as.data.frame(boston_scaled)
bins <- quantile(boston_scaled$crim)
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
n <- nrow(boston_scaled)
ind <- sample(n,  size = n * 0.8)
train <- boston_scaled[ind,]
test <- boston_scaled[-ind,]
correct_classes <- test$crime
test <- dplyr::select(test, -crime)

# LDA
lda.fit <- lda(crime ~ ., data = train)
model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 404  13
dim(lda.fit$scaling)
## [1] 13  3
# matrix multiplication
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = as.numeric(train$crime))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.